R Markdown

library(mfp)
## Loading required package: survival
library(lattice)
library(AUC)
## AUC 0.3.0
## Type AUCNews() to see the change log and ?AUC to get an overview.
library(plyr)
library(MASS)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
library(mvtnorm)
library(car)
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:AUC':
## 
##     sensitivity, specificity
## The following object is masked from 'package:survival':
## 
##     cluster

Попарные диаграммы рассеяния всех количественных признаков:

wine_data = read.csv('wine.csv')

colnames(wine_data) = c("X","type","constant_acidity","acetic_acidity","critic_acid_amount","residual_sugar","chlorides","free_sulfur_dioxide","total_sulfur_dioxide","density","pH","sulphates","alcohol","grade")

wine_data$X = NULL
wine_data$type = as.integer(wine_data$type == "красное")

panel.hist <- function(x, ...){
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "red", ...)
}

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * r)
}

mycol <- rgb(30,30,30,100,maxColorValue=255)

panel.dots <- function(x, y, ...){
  points(x, y, pch=19, col=mycol)
}
wine_data$below_grade = as.integer(wine_data$grade < 6)
wine_data$above_grade = as.integer(wine_data$grade > 6)
wine_data$grade = NULL

pairs(wine_data[,c("constant_acidity","acetic_acidity","critic_acid_amount","residual_sugar","chlorides","free_sulfur_dioxide","total_sulfur_dioxide","density","pH","sulphates","alcohol")], diag.panel=panel.hist,upper.panel = panel.cor, lower.panel = panel.dots)

Видна корелляция между содержанием алкоголя и плотностью.

Посмотрим на распределение занчений целевой переменной.

#par(mfrow=c(1,2))
hist(wine_data$alcohol, col="red", main="", xlab="alcohol", breaks=50)

Видно что оно не очень нормальное – надо примениить преобразование отклика методом Бокса-Кокса. Кроме того, сразу видим, что имеется выброс с максимальным значением по признаку алкогольности, выбросим его сразу перед применением преобразования.

После этих действий, снова построим распределение преобразованной алкогольности.

wine_data = subset(wine_data, wine_data$alcohol != 14.9) # удаляем как выброс
wine_data$alcohol = predict(BoxCoxTrans(wine_data$alcohol), wine_data$alcohol)
hist(wine_data$alcohol, col="red", main="", xlab="alcohol", breaks=50)

После применения преобразования распределение стало больше похоже на нормальное.

Теперь построим линейную модель по всем признакам:

#Создание модели
m1 = lm(alcohol ~., data=wine_data)

##Значимость признаков

get_multi_hyotesys_corrected_p_value = function(m, EstType){
  beta  = coef(m)[-1]
  Vbeta = vcovHC(m, type = EstType)[-1,-1]
  D = diag(1 / sqrt(diag(Vbeta)))
  t = D %*% beta
  Cor = D %*% Vbeta %*% t(D)
  m.df = length(m$residuals) - length(beta)
  p_adj = sapply(abs(t), function(x) 1-pmvt(-rep(x, length(beta)), rep(x, length(beta)), corr = Cor, df = m.df))
  c(NaN, p_adj)
}

s1 <-summary(m1)
s1$coefficients <- cbind(s1$coefficients, get_multi_hyotesys_corrected_p_value(m1, "HC0"))
dimnames(s1$coefficients)[[2]][5] <- "Adjusted p-value"
s1
## 
## Call:
## lm(formula = alcohol ~ ., data = wine_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0045807 -0.0003954 -0.0000098  0.0003744  0.0206682 
## 
## Coefficients:
##                         Estimate  Std. Error     t value    Pr(>|t|)
## (Intercept)            1.373e+00   7.421e-03   1.851e+02   0.000e+00
## type                   1.423e-03   5.016e-05   2.838e+01  4.350e-167
## constant_acidity       7.166e-04   1.212e-05   5.912e+01   0.000e+00
## acetic_acidity         8.289e-04   7.828e-05   1.059e+01   5.476e-26
## critic_acid_amount     4.996e-04   7.528e-05   6.637e+00   3.471e-11
## residual_sugar         2.812e-04   4.097e-06   6.863e+01   0.000e+00
## chlorides             -1.582e-03   3.178e-04  -4.977e+00   6.634e-07
## free_sulfur_dioxide   -4.330e-06   7.286e-07  -5.943e+00   2.952e-09
## total_sulfur_dioxide  -4.067e-07   3.088e-07  -1.317e+00   1.879e-01
## density               -8.503e-01   7.638e-03  -1.113e+02   0.000e+00
## pH                     3.619e-03   7.387e-05   4.900e+01   0.000e+00
## sulphates              1.341e-03   7.111e-05   1.886e+01   2.642e-77
## below_grade           -2.612e-04   2.102e-05  -1.243e+01   4.614e-35
## above_grade            2.559e-05   2.476e-05   1.034e+00   3.014e-01
##                      Adjusted p-value
## (Intercept)                        NA
## type                            0.000
## constant_acidity                0.000
## acetic_acidity                  0.003
## critic_acid_amount              0.003
## residual_sugar                  0.000
## chlorides                       0.023
## free_sulfur_dioxide             0.000
## total_sulfur_dioxide            0.998
## density                         0.000
## pH                              0.000
## sulphates                       0.000
## below_grade                     0.000
## above_grade                     0.987
## 
## Residual standard error: 0.0006971 on 6482 degrees of freedom
## Multiple R-squared:  0.8117, Adjusted R-squared:  0.8113 
## F-statistic:  2149 on 13 and 6482 DF,  p-value: < 2.2e-16

Для отсева выбросов строим график расстояния Кука.

##Расстояние кука

plot(fitted(m1), cooks.distance(m1), xlab="Fitted alcohol", ylab="Cook's distance")
lines(c(0,100), c(0.015, 0.015), col="red")

plot(wine_data$alcohol, cooks.distance(m1), xlab="alcohol", ylab="Cook's distance")
lines(c(0,100), c(0.015, 0.015), col="red")

Видим выбросы и пытаемся отфильтровать их по расстоянию кука, построив после этого снова графиик расстояния Кука для новой модели.

## [1] 4469

Теперь можно считать, что выбросов нет.

Построим значения трех критериев для остатков модели.

Критерий p
Шапиро-Уилка 5.052935410^{-15}
Уилкоксона 0.6866825
Бройша-Пагана 5.531778610^{-28}

Остатки ненормальны и гетероскедастичны, поэтому оценку значимости признаков будем делать с дисперсиями Уайта. Также будем делать поправку на множественность.

Посмотрим на графики остатков и попытаемся найти квадратичные зависимости:

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.0022e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

Видим, что у признаков residual_sugar и chlorides тренды зависимостей похожи на квадратичные, поэтому добавим это в новую модель.

m2 <- lm(alcohol ~ . + I(residual_sugar^2) + I(chlorides^2) , data=wine_data)

Произведем сравнение моделей по критерию Вальда с дисперсиями Уайта:

#Сравнение моделей
waldtest(m2, m1, vcov = vcovHC(m2, type = "HC0"))
## Wald test
## 
## Model 1: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount + 
##     residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + 
##     density + pH + sulphates + below_grade + above_grade + I(residual_sugar^2) + 
##     I(chlorides^2)
## Model 2: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount + 
##     residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + 
##     density + pH + sulphates + below_grade + above_grade
##   Res.Df Df      F   Pr(>F)    
## 1   4453                       
## 2   4455 -2 16.483 7.38e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Модель получилась значимо лучше.

Вот графики ошибок новой модели:

##Графики ошибок
plot(1:dim(wine_data)[1], rstandard(m2), xlab="i", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(1:dim(wine_data)[1], rstandard(m2))
grid()

plot(fitted(m2), rstandard(m2), xlab="Fitted values", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(fitted(m2), rstandard(m2))
grid()

for (col_name in colnames(wine_data)) {
    if (col_name != "alcohol") {
        plot(wine_data[,col_name], rstandard(m2), xlab=col_name, ylab="Standardized residuals", col=mycol, pch=19)
        addtrend(wine_data[,col_name], rstandard(m2))
        grid()
    }
}
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.0022e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

Посмотрим на значимость признаков полученной модели, не забывая про поправку на множественность:

##Значимость признаков второй модели
s1 =summary(m2)
s1$coefficients = cbind(s1$coefficients, get_multi_hyotesys_corrected_p_value(m2, "HC0"))
dimnames(s1$coefficients)[[2]][5] = "Adjusted p-value"
s1
## 
## Call:
## lm(formula = alcohol ~ . + I(residual_sugar^2) + I(chlorides^2), 
##     data = wine_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -8.902e-04 -2.335e-04 -9.520e-06  2.339e-04  9.044e-04 
## 
## Coefficients:
##                         Estimate  Std. Error     t value    Pr(>|t|)
## (Intercept)            1.432e+00   4.563e-03   3.138e+02   0.000e+00
## type                   1.660e-03   3.169e-05   5.239e+01   0.000e+00
## constant_acidity       7.420e-04   7.386e-06   1.005e+02   0.000e+00
## acetic_acidity         5.316e-04   4.875e-05   1.090e+01   2.413e-27
## critic_acid_amount     3.661e-04   4.539e-05   8.066e+00   9.293e-16
## residual_sugar         3.162e-04   4.343e-06   7.281e+01   0.000e+00
## chlorides              3.660e-04   5.033e-04   7.271e-01   4.672e-01
## free_sulfur_dioxide   -4.186e-06   4.277e-07  -9.787e+00   2.157e-22
## total_sulfur_dioxide   9.427e-08   1.833e-07   5.144e-01   6.070e-01
## density               -9.092e-01   4.692e-03  -1.938e+02   0.000e+00
## pH                     3.531e-03   4.291e-05   8.228e+01   0.000e+00
## sulphates              1.332e-03   4.356e-05   3.058e+01  1.283e-186
## below_grade           -2.054e-04   1.201e-05  -1.709e+01   1.723e-63
## above_grade           -3.599e-05   1.321e-05  -2.724e+00   6.465e-03
## I(residual_sugar^2)   -1.021e-06   2.273e-07  -4.490e+00   7.287e-06
## I(chlorides^2)        -3.308e-03   1.520e-03  -2.177e+00   2.953e-02
##                      Adjusted p-value
## (Intercept)                        NA
## type                            0.000
## constant_acidity                0.000
## acetic_acidity                  0.000
## critic_acid_amount              0.000
## residual_sugar                  0.000
## chlorides                       0.996
## free_sulfur_dioxide             0.000
## total_sulfur_dioxide            1.000
## density                         0.000
## pH                              0.000
## sulphates                       0.000
## below_grade                     0.000
## above_grade                     0.040
## I(residual_sugar^2)             0.000
## I(chlorides^2)                  0.009
## 
## Residual standard error: 0.0003165 on 4453 degrees of freedom
## Multiple R-squared:  0.9579, Adjusted R-squared:  0.9578 
## F-statistic:  6755 on 15 and 4453 DF,  p-value: < 2.2e-16
add1(m2, ~ .^2, test="F")
## Single term additions
## 
## Model:
## alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount + 
##     residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + 
##     density + pH + sulphates + below_grade + above_grade + I(residual_sugar^2) + 
##     I(chlorides^2)
##                                          Df  Sum of Sq        RSS    AIC
## <none>                                                 0.00044609 -72008
## type:constant_acidity                     1 9.0000e-10 0.00044609 -72006
## type:acetic_acidity                       1 5.2000e-08 0.00044604 -72006
## type:critic_acid_amount                   1 6.4400e-08 0.00044602 -72007
## type:residual_sugar                       1 6.1300e-08 0.00044603 -72007
## type:chlorides                            1 2.4500e-08 0.00044606 -72006
## type:free_sulfur_dioxide                  1 3.5680e-07 0.00044573 -72009
## type:total_sulfur_dioxide                 1 1.8643e-06 0.00044422 -72025
## type:density                              1 6.0290e-07 0.00044549 -72012
## type:pH                                   1 2.1271e-06 0.00044396 -72027
## type:sulphates                            1 2.6703e-06 0.00044342 -72033
## type:below_grade                          1 6.5050e-07 0.00044544 -72012
## type:above_grade                          1 1.1543e-06 0.00044493 -72017
## type:I(residual_sugar^2)                  1 4.6600e-08 0.00044604 -72006
## type:I(chlorides^2)                       1 2.8200e-08 0.00044606 -72006
## constant_acidity:acetic_acidity           1 4.1000e-09 0.00044608 -72006
## constant_acidity:critic_acid_amount       1 5.2030e-07 0.00044557 -72011
## constant_acidity:residual_sugar           1 1.4200e-08 0.00044607 -72006
## constant_acidity:chlorides                1 0.0000e+00 0.00044609 -72006
## constant_acidity:free_sulfur_dioxide      1 2.6450e-07 0.00044582 -72009
## constant_acidity:total_sulfur_dioxide     1 2.8880e-07 0.00044580 -72009
## constant_acidity:density                  1 4.5200e-08 0.00044604 -72006
## constant_acidity:pH                       1 3.5010e-07 0.00044574 -72009
## constant_acidity:sulphates                1 9.7960e-07 0.00044511 -72016
## constant_acidity:below_grade              1 8.2330e-07 0.00044526 -72014
## constant_acidity:above_grade              1 1.0874e-06 0.00044500 -72017
## constant_acidity:I(residual_sugar^2)      1 7.8000e-09 0.00044608 -72006
## constant_acidity:I(chlorides^2)           1 1.0700e-08 0.00044608 -72006
## acetic_acidity:critic_acid_amount         1 4.6120e-07 0.00044563 -72011
## acetic_acidity:residual_sugar             1 3.1000e-09 0.00044608 -72006
## acetic_acidity:chlorides                  1 6.6000e-08 0.00044602 -72007
## acetic_acidity:free_sulfur_dioxide        1 2.6960e-07 0.00044582 -72009
## acetic_acidity:total_sulfur_dioxide       1 1.4815e-06 0.00044461 -72021
## acetic_acidity:density                    1 5.1800e-07 0.00044557 -72011
## acetic_acidity:pH                         1 9.9970e-07 0.00044509 -72016
## acetic_acidity:sulphates                  1 7.3010e-07 0.00044536 -72013
## acetic_acidity:below_grade                1 1.3530e-07 0.00044595 -72007
## acetic_acidity:above_grade                1 9.5900e-08 0.00044599 -72007
## acetic_acidity:I(residual_sugar^2)        1 2.1210e-07 0.00044588 -72008
## acetic_acidity:I(chlorides^2)             1 1.4100e-08 0.00044607 -72006
## critic_acid_amount:residual_sugar         1 1.9400e-08 0.00044607 -72006
## critic_acid_amount:chlorides              1 2.5000e-09 0.00044609 -72006
## critic_acid_amount:free_sulfur_dioxide    1 7.6400e-08 0.00044601 -72007
## critic_acid_amount:total_sulfur_dioxide   1 1.2800e-08 0.00044608 -72006
## critic_acid_amount:density                1 2.2000e-09 0.00044609 -72006
## critic_acid_amount:pH                     1 1.3840e-07 0.00044595 -72007
## critic_acid_amount:sulphates              1 4.1800e-08 0.00044605 -72006
## critic_acid_amount:below_grade            1 2.0570e-07 0.00044588 -72008
## critic_acid_amount:above_grade            1 2.0840e-07 0.00044588 -72008
## critic_acid_amount:I(residual_sugar^2)    1 7.7000e-09 0.00044608 -72006
## critic_acid_amount:I(chlorides^2)         1 0.0000e+00 0.00044609 -72006
## residual_sugar:chlorides                  1 3.0080e-07 0.00044579 -72009
## residual_sugar:free_sulfur_dioxide        1 2.0170e-07 0.00044589 -72008
## residual_sugar:total_sulfur_dioxide       1 8.8880e-07 0.00044520 -72015
## residual_sugar:density                    1 2.5604e-06 0.00044353 -72032
## residual_sugar:pH                         1 4.5410e-07 0.00044563 -72010
## residual_sugar:sulphates                  1 2.1600e-08 0.00044607 -72006
## residual_sugar:below_grade                1 2.2270e-07 0.00044587 -72008
## residual_sugar:above_grade                1 6.5870e-07 0.00044543 -72013
## residual_sugar:I(residual_sugar^2)        1 1.1980e-07 0.00044597 -72007
## residual_sugar:I(chlorides^2)             1 7.0900e-08 0.00044602 -72007
## chlorides:free_sulfur_dioxide             1 3.1020e-07 0.00044578 -72009
## chlorides:total_sulfur_dioxide            1 8.4930e-07 0.00044524 -72014
## chlorides:density                         1 0.0000e+00 0.00044609 -72006
## chlorides:pH                              1 3.4618e-06 0.00044263 -72041
## chlorides:sulphates                       1 1.3622e-06 0.00044473 -72020
## chlorides:below_grade                     1 9.1760e-07 0.00044517 -72015
## chlorides:above_grade                     1 1.2676e-06 0.00044482 -72019
## chlorides:I(residual_sugar^2)             1 4.8090e-07 0.00044561 -72011
## chlorides:I(chlorides^2)                  1 2.1400e-08 0.00044607 -72006
## free_sulfur_dioxide:total_sulfur_dioxide  1 3.4100e-08 0.00044605 -72006
## free_sulfur_dioxide:density               1 6.8800e-07 0.00044540 -72013
## free_sulfur_dioxide:pH                    1 4.2410e-07 0.00044566 -72010
## free_sulfur_dioxide:sulphates             1 9.9770e-07 0.00044509 -72016
## free_sulfur_dioxide:below_grade           1 9.5000e-09 0.00044608 -72006
## free_sulfur_dioxide:above_grade           1 2.6000e-09 0.00044609 -72006
## free_sulfur_dioxide:I(residual_sugar^2)   1 2.3290e-07 0.00044586 -72008
## free_sulfur_dioxide:I(chlorides^2)        1 1.0420e-07 0.00044598 -72007
## total_sulfur_dioxide:density              1 5.3754e-06 0.00044071 -72060
## total_sulfur_dioxide:pH                   1 1.1000e-09 0.00044609 -72006
## total_sulfur_dioxide:sulphates            1 8.2600e-07 0.00044526 -72014
## total_sulfur_dioxide:below_grade          1 1.6900e-08 0.00044607 -72006
## total_sulfur_dioxide:above_grade          1 1.1300e-08 0.00044608 -72006
## total_sulfur_dioxide:I(residual_sugar^2)  1 3.5260e-07 0.00044574 -72009
## total_sulfur_dioxide:I(chlorides^2)       1 1.6630e-07 0.00044592 -72008
## density:pH                                1 7.2215e-06 0.00043887 -72079
## density:sulphates                         1 3.5252e-06 0.00044256 -72041
## density:below_grade                       1 1.3545e-06 0.00044473 -72019
## density:above_grade                       1 2.1661e-06 0.00044392 -72028
## density:I(residual_sugar^2)               1 1.2864e-06 0.00044480 -72019
## density:I(chlorides^2)                    1 9.5800e-08 0.00044599 -72007
## pH:sulphates                              1 5.5670e-07 0.00044553 -72011
## pH:below_grade                            1 1.5486e-06 0.00044454 -72021
## pH:above_grade                            1 2.3691e-06 0.00044372 -72030
## pH:I(residual_sugar^2)                    1 4.9210e-07 0.00044560 -72011
## pH:I(chlorides^2)                         1 1.0284e-06 0.00044506 -72016
## sulphates:below_grade                     1 2.9220e-07 0.00044580 -72009
## sulphates:above_grade                     1 7.7900e-08 0.00044601 -72007
## sulphates:I(residual_sugar^2)             1 4.7500e-08 0.00044604 -72006
## sulphates:I(chlorides^2)                  1 2.0970e-07 0.00044588 -72008
## below_grade:above_grade                   0 0.0000e+00 0.00044609 -72008
## below_grade:I(residual_sugar^2)           1 9.2000e-09 0.00044608 -72006
## below_grade:I(chlorides^2)                1 1.9040e-07 0.00044590 -72008
## above_grade:I(residual_sugar^2)           1 2.7230e-07 0.00044582 -72009
## above_grade:I(chlorides^2)                1 1.6370e-07 0.00044592 -72008
## I(residual_sugar^2):I(chlorides^2)        1 1.3330e-07 0.00044595 -72007
##                                          F value    Pr(>F)    
## <none>                                                        
## type:constant_acidity                     0.0094 0.9226003    
## type:acetic_acidity                       0.5194 0.4711304    
## type:critic_acid_amount                   0.6426 0.4228245    
## type:residual_sugar                       0.6118 0.4341398    
## type:chlorides                            0.2442 0.6212160    
## type:free_sulfur_dioxide                  3.5640 0.0591111 .  
## type:total_sulfur_dioxide                18.6842 1.576e-05 ***
## type:density                              6.0251 0.0141415 *  
## type:pH                                  21.3301 3.975e-06 ***
## type:sulphates                           26.8106 2.343e-07 ***
## type:below_grade                          6.5013 0.0108128 *  
## type:above_grade                         11.5494 0.0006836 ***
## type:I(residual_sugar^2)                  0.4652 0.4952471    
## type:I(chlorides^2)                       0.2819 0.5955113    
## constant_acidity:acetic_acidity           0.0408 0.8400131    
## constant_acidity:critic_acid_amount       5.1982 0.0226571 *  
## constant_acidity:residual_sugar           0.1420 0.7062802    
## constant_acidity:chlorides                0.0002 0.9891574    
## constant_acidity:free_sulfur_dioxide      2.6412 0.1041974    
## constant_acidity:total_sulfur_dioxide     2.8845 0.0895055 .  
## constant_acidity:density                  0.4513 0.5017584    
## constant_acidity:pH                       3.4972 0.0615399 .  
## constant_acidity:sulphates                9.7982 0.0017581 ** 
## constant_acidity:below_grade              8.2314 0.0041366 ** 
## constant_acidity:above_grade             10.8792 0.0009801 ***
## constant_acidity:I(residual_sugar^2)      0.0780 0.7800876    
## constant_acidity:I(chlorides^2)           0.1072 0.7434242    
## acetic_acidity:critic_acid_amount         4.6074 0.0318879 *  
## acetic_acidity:residual_sugar             0.0305 0.8613962    
## acetic_acidity:chlorides                  0.6591 0.4169188    
## acetic_acidity:free_sulfur_dioxide        2.6923 0.1009055    
## acetic_acidity:total_sulfur_dioxide      14.8344 0.0001190 ***
## acetic_acidity:density                    5.1762 0.0229458 *  
## acetic_acidity:pH                         9.9994 0.0015765 ** 
## acetic_acidity:sulphates                  7.2989 0.0069258 ** 
## acetic_acidity:below_grade                1.3506 0.2452359    
## acetic_acidity:above_grade                0.9576 0.3278464    
## acetic_acidity:I(residual_sugar^2)        2.1182 0.1456274    
## acetic_acidity:I(chlorides^2)             0.1408 0.7074931    
## critic_acid_amount:residual_sugar         0.1941 0.6595702    
## critic_acid_amount:chlorides              0.0253 0.8735654    
## critic_acid_amount:free_sulfur_dioxide    0.7623 0.3826677    
## critic_acid_amount:total_sulfur_dioxide   0.1277 0.7208939    
## critic_acid_amount:density                0.0219 0.8823505    
## critic_acid_amount:pH                     1.3819 0.2398437    
## critic_acid_amount:sulphates              0.4172 0.5183860    
## critic_acid_amount:below_grade            2.0543 0.1518467    
## critic_acid_amount:above_grade            2.0813 0.1491818    
## critic_acid_amount:I(residual_sugar^2)    0.0769 0.7815563    
## critic_acid_amount:I(chlorides^2)         0.0002 0.9894453    
## residual_sugar:chlorides                  3.0036 0.0831487 .  
## residual_sugar:free_sulfur_dioxide        2.0139 0.1559334    
## residual_sugar:total_sulfur_dioxide       8.8883 0.0028855 ** 
## residual_sugar:density                   25.7005 4.149e-07 ***
## residual_sugar:pH                         4.5364 0.0332354 *  
## residual_sugar:sulphates                  0.2156 0.6424614    
## residual_sugar:below_grade                2.2239 0.1359596    
## residual_sugar:above_grade                6.5836 0.0103246 *  
## residual_sugar:I(residual_sugar^2)        1.1963 0.2741307    
## residual_sugar:I(chlorides^2)             0.7074 0.4003608    
## chlorides:free_sulfur_dioxide             3.0980 0.0784563 .  
## chlorides:total_sulfur_dioxide            8.4920 0.0035848 ** 
## chlorides:density                         0.0004 0.9841326    
## chlorides:pH                             34.8189 3.887e-09 ***
## chlorides:sulphates                      13.6366 0.0002245 ***
## chlorides:below_grade                     9.1763 0.0024658 ** 
## chlorides:above_grade                    12.6866 0.0003721 ***
## chlorides:I(residual_sugar^2)             4.8043 0.0284399 *  
## chlorides:I(chlorides^2)                  0.2132 0.6443137    
## free_sulfur_dioxide:total_sulfur_dioxide  0.3407 0.5594622    
## free_sulfur_dioxide:density               6.8771 0.0087603 ** 
## free_sulfur_dioxide:pH                    4.2368 0.0396136 *  
## free_sulfur_dioxide:sulphates             9.9791 0.0015939 ** 
## free_sulfur_dioxide:below_grade           0.0947 0.7583384    
## free_sulfur_dioxide:above_grade           0.0263 0.8712493    
## free_sulfur_dioxide:I(residual_sugar^2)   2.3255 0.1273414    
## free_sulfur_dioxide:I(chlorides^2)        1.0398 0.3079162    
## total_sulfur_dioxide:density             54.3010 2.039e-13 ***
## total_sulfur_dioxide:pH                   0.0113 0.9153002    
## total_sulfur_dioxide:sulphates            8.2592 0.0040739 ** 
## total_sulfur_dioxide:below_grade          0.1684 0.6815643    
## total_sulfur_dioxide:above_grade          0.1124 0.7374008    
## total_sulfur_dioxide:I(residual_sugar^2)  3.5218 0.0606307 .  
## total_sulfur_dioxide:I(chlorides^2)       1.6608 0.1975617    
## density:pH                               73.2575 < 2.2e-16 ***
## density:sulphates                        35.4621 2.801e-09 ***
## density:below_grade                      13.5594 0.0002339 ***
## density:above_grade                      21.7235 3.241e-06 ***
## density:I(residual_sugar^2)              12.8757 0.0003365 ***
## density:I(chlorides^2)                    0.9559 0.3282636    
## pH:sulphates                              5.5627 0.0183901 *  
## pH:below_grade                           15.5090 8.337e-05 ***
## pH:above_grade                           23.7699 1.124e-06 ***
## pH:I(residual_sugar^2)                    4.9167 0.0266479 *  
## pH:I(chlorides^2)                        10.2874 0.0013489 ** 
## sulphates:below_grade                     2.9181 0.0876604 .  
## sulphates:above_grade                     0.7774 0.3779811    
## sulphates:I(residual_sugar^2)             0.4737 0.4913229    
## sulphates:I(chlorides^2)                  2.0942 0.1479264    
## below_grade:above_grade                                       
## below_grade:I(residual_sugar^2)           0.0922 0.7614104    
## below_grade:I(chlorides^2)                1.9013 0.1680038    
## above_grade:I(residual_sugar^2)           2.7197 0.0991848 .  
## above_grade:I(chlorides^2)                1.6348 0.2011104    
## I(residual_sugar^2):I(chlorides^2)        1.3309 0.2487134    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Делаем вывод, что лучше удалить признаки above_grade,chlorides,total_sulfur_dioxide, но они могут оказаться хорошими в парах с некоторыми другими признаками. Из вывода выше, подберем самые лучшие пары и внесем их в модель: пары density:above_grade,pH:above_grade,chlorides:pH,type:total_sulfur_dioxide,total_sulfur_dioxide:density

m3 = lm(alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount + residual_sugar + free_sulfur_dioxide + density + pH + sulphates + below_grade + I(residual_sugar^2) + I(chlorides^2) + density*above_grade + pH*above_grade + chlorides*pH + type*total_sulfur_dioxide + total_sulfur_dioxide*density, data=wine_data)

Снова произведем сравнение моделей

##Сравнение моделей
waldtest(m3, m2, vcov = vcovHC(m3, type = "HC0"))
## Wald test
## 
## Model 1: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount + 
##     residual_sugar + free_sulfur_dioxide + density + pH + sulphates + 
##     below_grade + I(residual_sugar^2) + I(chlorides^2) + density * 
##     above_grade + pH * above_grade + chlorides * pH + type * 
##     total_sulfur_dioxide + total_sulfur_dioxide * density
## Model 2: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount + 
##     residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + 
##     density + pH + sulphates + below_grade + above_grade + I(residual_sugar^2) + 
##     I(chlorides^2)
##   Res.Df Df      F    Pr(>F)    
## 1   4448                        
## 2   4453 -5 30.167 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Получили значимо лучшую модель.

Вот графики остатков:

#Графики ошибок
plot(1:dim(wine_data)[1], rstandard(m3), xlab="i", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(1:dim(wine_data)[1], rstandard(m3))
grid()

plot(fitted(m3), rstandard(m3), xlab="Fitted values", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(fitted(m3), rstandard(m3))
grid()

for (col_name in colnames(wine_data)) {
    if (col_name != "alcohol" && col_name != "above_grade" && col_name != "chlorides" && col_name != "total_sulfur_dioxide") {
        plot(wine_data[,col_name], rstandard(m3), xlab=col_name, ylab="Standardized residuals", col=mycol, pch=19)
        addtrend(wine_data[,col_name], rstandard(m3))
        grid()
    }
}
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.0022e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01

Считаем, что m3 – самая хорошая модель, которая у нас получилась. Чтобы сделать из нее человеческие выводы, посмотрим на ее коэффициенты

m3$coefficients
##                  (Intercept)                         type 
##                 1.405789e+00                 1.675644e-03 
##             constant_acidity               acetic_acidity 
##                 7.349085e-04                 5.447313e-04 
##           critic_acid_amount               residual_sugar 
##                 4.079470e-04                 3.129975e-04 
##          free_sulfur_dioxide                      density 
##                -4.102645e-06                -8.818967e-01 
##                           pH                    sulphates 
##                 3.225892e-03                 1.366362e-03 
##                  below_grade          I(residual_sugar^2) 
##                -1.898796e-04                -5.226072e-07 
##               I(chlorides^2)                  above_grade 
##                -2.114887e-03                -1.838907e-02 
##                    chlorides         total_sulfur_dioxide 
##                -2.329061e-02                 3.056929e-04 
##          density:above_grade               pH:above_grade 
##                 1.943490e-02                -2.915329e-04 
##                 pH:chlorides    type:total_sulfur_dioxide 
##                 7.336650e-03                -9.881718e-07 
## density:total_sulfur_dioxide 
##                -3.070120e-04

Видим, что наиболее весомый признак – плотность, причем с отрицательнным весом. Это логично, потому что плотность спирта меньше плотности воды, и поэтому этот признак довольно четко связан с алкогольностью. Второй по значимости признак – оценка, из которого мы видим, что чем менее алкогольное вино, тем вероятнее оно получит более высокую оценку.